Multiples factors play an important function during the naive CD4-T cell differentiation where Blimp1 (encoded by Prdm1) and Bcl6 are the masters. In this notebook, we are going to characterize the accessibility pattern of these regulators.
library(Seurat)
library(Signac)
library(GenomicRanges)
library(pheatmap)
library(JASPAR2020)
library(TFBSTools)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(data.table)
library(ggpubr)
library(writexl)
library(plyr)
library(stringr)
library(rio)
library(biomaRt)
cell_type <- "CD4_T"
path_to_obj <- paste0(
here::here("scATAC-seq/results/R_objects/level_5/"),
cell_type,
"/04.",
cell_type,
"_integration_peak_calling_level_5.rds",
sep = ""
)
path_to_RNA <- paste0(
here::here("scRNA-seq/3-clustering/5-level_5/"),
cell_type,
"/",
cell_type,
"_subseted_annotated_level_5.rds",
sep = ""
)
upstream <- 2000
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D",
"#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
"#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
"#FBE426", "#16FF32", "black", "#3283FE",
"#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB",
"#822E1C", "coral2", "#1CFFCE", "#1CBE4F",
"#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
"#FEAF16", "#683B79", "#B10DA1", "#1C7F93",
"#F8A19F", "dark orange", "#FEAF16",
"#FBE426", "Brown")
plot_dim <- function(seurat, group){
DimPlot(seurat,
group.by = group,
cols = color_palette,
pt.size = 0.1,raster=FALSE)
}
mat_heatmap <- function(seurat, features, group,
cutree_ncols,cutree_nrows){
my_sample_col <- data.frame(Group = rep(c("Naive", "Central Memory",
"Central Memory","Non-Tfh",
"Non-Tfh","Non-Tfh",
"Non-Tfh","Non-Tfh",
"Non-Tfh","Tfh",
"Tfh","Tfh",
"Tfh","Tfh")))
rownames(my_sample_col) = c("Naive", "CM Pre-non-Tfh","CM PreTfh",
"T-Trans-Mem","T-Eff-Mem","T-helper",
"Eff-Tregs","non-GC-Tf-regs","GC-Tf-regs" ,
"B border_Tfh T", "Tfh-LZ-GC",
"GC-Tfh-SAP","GC-Tfh-0X40","Tfh-Mem")
annoCol<-list(Group=c("Naive"="grey", "Central Memory"="black",
"Non-Tfh"="red", "Tfh"="yellow"))
avgexpr_mat <- AverageExpression(
features = features,
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = group,
slot = "data")
p1 <- pheatmap(avgexpr_mat$peaks_level_5[,c(rownames(my_sample_col))],
scale = "row",
angle_col = 45,
show_rownames=T,
annotation_col = my_sample_col,
annotation_colors = annoCol,
border_color = "white",
cluster_rows = T,
cluster_cols = F,
fontsize_col = 10,
clustering_distance_rows = "euclidean",
clustering_method = "ward.D2",
cutree_rows = cutree_nrows)
return(p1)}
ensembl <- useMart(biomart = "ensembl",dataset="hsapiens_gene_ensembl")
Datasets <- listDatasets(ensembl)
Datasets[grep("hsapiens_gene_ensembl",Datasets$dataset),]
## dataset description version
## 80 hsapiens_gene_ensembl Human genes (GRCh38.p13) GRCh38.p13
hg38.gene.annot <- getBM(attributes = c("ensembl_gene_id",
"entrezgene_id",
"hgnc_symbol",
"chromosome_name",
"start_position",
"end_position",
"strand","band",
"gene_biotype"),
mart = ensembl)
## Arrange also strand info.
hg38.gene.annot$chromosome_name <- paste0("chr",hg38.gene.annot$chromosome_name)
hg38.gene.annot$strand[which(hg38.gene.annot$strand==1)] <- "+"
hg38.gene.annot$strand[which(hg38.gene.annot$strand=="-1")] <- "-"
##polish gene annotation
hg38.gene.annot$hgnc_symbol[which(hg38.gene.annot$hgnc_symbol=="")] <- NA
##make GRange object
hg38.gene.annot.GR <- GRanges(seqnames = hg38.gene.annot$chromosome_name,
ranges = IRanges(hg38.gene.annot$start_position,
end = hg38.gene.annot$end_position),
strand = hg38.gene.annot$strand)
mcols(hg38.gene.annot.GR) <- hg38.gene.annot[,grep("^chromosome_name$|^start_position$|^end_position|^strand$$",
colnames(hg38.gene.annot),value = T,invert = T)]
hg38.gene.annot.GR <- sort(sortSeqlevels(hg38.gene.annot.GR))
## Extend 2,000bps upstream of promoters
hg38.gene.annot.2000.GR <- punion(promoters(x = hg38.gene.annot.GR,
upstream = 2000,
downstream = 0),
hg38.gene.annot.GR)
hg38.gene.annot.2000.GR$gene_name <- hg38.gene.annot.GR$hgnc_symbol
seurat <- readRDS(path_to_obj)
seurat_peaks <- seurat@assays$peaks_level_5@ranges
seurat
## An object of class Seurat
## 93116 features across 16383 samples within 1 assay
## Active assay: peaks_level_5 (93116 features, 92407 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
plot_dim(seurat, group = "annotation_paper")
seurat_RNA <- readRDS(path_to_RNA)
seurat_RNA
## An object of class Seurat
## 37378 features across 52307 samples within 1 assay
## Active assay: RNA (37378 features, 0 variable features)
## 3 dimensional reductions calculated: pca, umap, harmony
plot_dim(seurat_RNA, group = "annotation_paper")
At low level of resolution, we want to detect the main epigenomic changes between Non-Tfh vs Tfh. For this reason, we decide to group the cells in 4 clusters: Non-Tfh, Tfh, Central Memory and Naive.
seurat@meta.data <- seurat@meta.data %>% mutate(Group =
case_when(annotation_paper == "Naive" ~ "Naive",
annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
annotation_paper == "CM PreTfh" ~ "Central Memory",
annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
annotation_paper == "T-helper" ~ "Non-Tfh",
annotation_paper == "Tfh T:B border" ~ "Tfh",
annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
annotation_paper == "Tfh-Mem" ~ "Tfh",
annotation_paper == "Memory T cells" ~ "Non-Tfh",
annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))
plot_dim(seurat, group = "Group")
seurat_RNA@meta.data <- seurat_RNA@meta.data %>% mutate(Group =
case_when(annotation_paper == "Naive" ~ "Naive",
annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
annotation_paper == "CM PreTfh" ~ "Central Memory",
annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
annotation_paper == "T-helper" ~ "Non-Tfh",
annotation_paper == "Tfh T:B border" ~ "Tfh",
annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
annotation_paper == "Tfh-Mem" ~ "Tfh",
annotation_paper == "Memory T cells" ~ "Non-Tfh",
annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))
plot_dim(seurat_RNA, group = "Group")
The main goal of this analisys is to extract the differential expressed genes between the groups previously defined.
Idents(seurat_RNA) <- "Group"
#DE <- FindAllMarkers(
# object = seurat_RNA,
#logfc.threshold = 0.25,
#test.use = "wilcox")
#output <- split(DE, DE$cluster)
path_to_save_DE <- here::here("scATAC-seq/results/files/CD4_T/DE_4_groups.xlsx")
#write_xlsx(output, path_to_save_DE)
#DT::datatable(DE)
DE <- import_list(path_to_save_DE, setclass = "tbl", rbind = TRUE)
colnames(DE) <- c("p_val", "avg_log2FC",
"pct.1" , "pct.2", "p_val_adj",
"cluster", "gene_name", "_file")
Tfh_genes <- c("TOX2", "PDCD1","CXCL13","TOX","BCL6",
"GNG4","IL21","SH2D1A","CD200","CXCR5","POU2AF1")
Naive_genes <- c("BACH2","LEF1","CCR7","NOSIP","KLF2","SELL")
Central_memory_genes <- c("ANK3","IL7R","TXNIP","ANXA1",
"ZBTB16","GPR183","TIGIT","IL21")
Non_Tfh_genes <- c("LAG3","RORA","IKZF2","KLRB1",
"IL2RA","PRDM1","IL1R1","CTLA4",
"FOXP3","CCR6","MAF","CCL20","IL1R2")
target_genes <- c(Naive_genes,Central_memory_genes,
Tfh_genes,Non_Tfh_genes)
expr_mat <- AverageExpression(
features = target_genes,
seurat_RNA,
return.seurat = F,
group.by = "Group",
slot = "data")
pheatmap(expr_mat$RNA[target_genes,],
scale = "row",
annotation_names_row = F,
border_color = "white",
cluster_rows = F,
cluster_cols = T,
fontsize_row = 10,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2")
#write.table(unique(expr_mat$RNA[target_genes,]),
# quote = F,
# here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_RNA_genes.tsv"))
hg38.gene.annot_targetted <- hg38.gene.annot.2000.GR[which(hg38.gene.annot.2000.GR$gene_name %in% target_genes),]
## Overlapping of the DE genes coordinates with the total number of peaks detected.
gr1 <- seurat_peaks
gr2 <- hg38.gene.annot_targetted
m <- findOverlaps(gr1, gr2)
gr1.matched <- gr1[queryHits(m)]
# Add the metadata from gr2
mcols(gr1.matched) <- cbind.data.frame(
mcols(gr1.matched),
mcols(gr2[subjectHits(m)]));
gr1.matched
## GRanges object with 515 ranges and 1 metadata column:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## [1] chr1 145996409-145997493 * | TXNIP
## [2] chr1 145998144-145998527 * | TXNIP
## [3] chr1 169692335-169693366 * | SELL
## [4] chr1 169694288-169694677 * | SELL
## [5] chr1 169699061-169699292 * | SELL
## ... ... ... ... . ...
## [511] chrX 124328812-124329858 * | SH2D1A
## [512] chrX 124339089-124339448 * | SH2D1A
## [513] chrX 124342323-124342811 * | SH2D1A
## [514] chrX 124346063-124346707 * | SH2D1A
## [515] chrX 124358836-124359410 * | SH2D1A
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
gr1.matched$peaks <- paste0(seqnames(gr1.matched),"-",
start(gr1.matched),"-",
end(gr1.matched))
gr1.matched_df <- as.data.frame(gr1.matched)
my_sample_col <- data.frame(Gene = c(gr1.matched$gene_name))
rownames(my_sample_col) = unique(gr1.matched$peaks)
avgexpr_mat <- AverageExpression(
features = unique(gr1.matched$peaks),
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = "Group",
slot = "data")
avgexpr_df <- as.data.frame(avgexpr_mat$peaks_level_5)
avgexpr_df$peaks <- row.names(avgexpr_df)
DA_DE_merge <- merge(avgexpr_df,
gr1.matched_df[c("peaks","gene_name")],
by=c("peaks"))
DA_DE_merge_melt <- melt(DA_DE_merge)
# Computing the mean accessibility/expression per gene
mean_accessibility <- tapply(DA_DE_merge_melt$value,
list(DA_DE_merge_melt$gene_name, DA_DE_merge_melt$variable),
mean)
out <- pheatmap(mean_accessibility[target_genes,],
scale = "row",
border_color = "white",
cluster_rows = F,
cluster_cols = T,
fontsize_row = 10,
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2")
#write.table(unique(mean_accessibility[target_genes,]),
# quote = F,
# here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_ATAC_genes.tsv"))
DA_DE_merge <- merge(melt(mean_accessibility),
melt(expr_mat$RNA),
by=c("Var1","Var2"))
colnames(DA_DE_merge) <- c("Gene" ,"Clusters","Accesibility","Expresion")
DA_DE_merge.melt <- melt(DA_DE_merge)
# Filtering conditions
df_Naive <- filter(
DA_DE_merge.melt,Clusters == "Naive" & Gene %in% Naive_genes)
df_CM <- filter(
DA_DE_merge.melt,Clusters == "Central Memory" & Gene %in% Central_memory_genes)
df_Tfh <- filter(
DA_DE_merge.melt,Clusters == "Tfh" & Gene %in% Tfh_genes)
df_Non_Tfn <- filter(
DA_DE_merge.melt,Clusters == "Non-Tfh" & Gene %in% Non_Tfh_genes)
selection_df <- rbind(df_Naive,df_CM,df_Tfh,df_Non_Tfn)
selection_df$value <- scale(selection_df$value)
ggdotchart(selection_df,
x="Gene",
y="value",
add = "segments") +
coord_flip() + facet_grid(vars(Clusters), vars(variable), scales = "free_y")
bcl6 <- hg38.gene.annot.2000.GR[which(hg38.gene.annot.2000.GR$gene_name %in% "BCL6"),]
bcl6_gr <- makeGRangesFromDataFrame(bcl6)
bcl6_plot <- CoveragePlot(
object = seurat,
region = bcl6_gr)
bcl6_plot
overlapping_bcl6 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks,
bcl6_gr)),]
features <- paste0(seqnames(overlapping_bcl6),"-",
start(overlapping_bcl6),"-",
end(overlapping_bcl6))
#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/bcl6_heatmaps.pdf"),
# width = 10,
# height = 4)
print(mat_heatmap(seurat = seurat,
features = features,
group = "annotation_paper",
cutree_ncols = 3,cutree_nrows = 1))
#dev.off()
prdm1 <- hg38.gene.annot.2000.GR[which(hg38.gene.annot.2000.GR$gene_name %in% "PRDM1"),]
prdm1_gr <- makeGRangesFromDataFrame(prdm1)
prdm1_plot <-CoveragePlot(
object = seurat,
region = prdm1_gr)
prdm1_plot
overlapping_prdm1 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks, prdm1_gr)),]
features <- paste0(seqnames(overlapping_prdm1),"-",
start(overlapping_prdm1),"-",
end(overlapping_prdm1))
#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/prdm1_heatmaps.pdf"),
# width = 10,
# height = 4)
print(mat_heatmap(seurat = seurat,
features = features,
group = "annotation_paper",
cutree_ncols = 3,cutree_nrows = 1))
#dev.off()
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] biomaRt_2.44.4 rio_0.5.16 plyr_1.8.6 writexl_1.4.0 ggpubr_0.4.0 data.table_1.14.0 forcats_0.5.0 stringr_1.4.0 purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 tidyverse_1.3.0 ggplot2_3.3.5 dplyr_1.0.7 TFBSTools_1.26.0 JASPAR2020_0.99.10 pheatmap_1.0.12 GenomicRanges_1.40.0 GenomeInfoDb_1.24.2 IRanges_2.22.1 S4Vectors_0.26.0 BiocGenerics_0.34.0 Signac_1.2.1 SeuratObject_4.0.2 Seurat_4.0.3 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 SnowballC_0.7.0 rtracklayer_1.48.0 scattermore_0.7 R.methodsS3_1.8.1 bit64_4.0.5 knitr_1.30 irlba_2.3.3 DelayedArray_0.14.0 R.utils_2.10.1 rpart_4.1-15 KEGGREST_1.28.0 RCurl_1.98-1.2 generics_0.1.0 cowplot_1.1.1 RSQLite_2.2.1 RANN_2.6.1 future_1.21.0 bit_4.0.4 spatstat.data_2.1-0 xml2_1.3.2 lubridate_1.7.9 httpuv_1.6.1 SummarizedExperiment_1.18.1 assertthat_0.2.1 DirichletMultinomial_1.30.0 xfun_0.18 hms_0.5.3 evaluate_0.14 promises_1.2.0.1 fansi_0.5.0 progress_1.2.2 caTools_1.18.2 dbplyr_1.4.4 readxl_1.3.1 igraph_1.2.6 DBI_1.1.0 htmlwidgets_1.5.3 sparsesvd_0.2 spatstat.geom_2.2-0 ellipsis_0.3.2 backports_1.1.10
## [43] bookdown_0.21 annotate_1.66.0 deldir_0.2-10 vctrs_0.3.8 Biobase_2.48.0 here_1.0.1 ROCR_1.0-11 abind_1.4-5 withr_2.4.2 ggforce_0.3.2 BSgenome_1.56.0 sctransform_0.3.2 GenomicAlignments_1.24.0 prettyunits_1.1.1 goftest_1.2-2 cluster_2.1.0 lazyeval_0.2.2 seqLogo_1.54.3 crayon_1.4.1 pkgconfig_2.0.3 slam_0.1-47 labeling_0.4.2 tweenr_1.0.1 nlme_3.1-150 rlang_0.4.11 globals_0.14.0 lifecycle_1.0.0 miniUI_0.1.1.1 BiocFileCache_1.12.1 modelr_0.1.8 cellranger_1.1.0 rprojroot_2.0.2 polyclip_1.10-0 matrixStats_0.59.0 lmtest_0.9-38 Matrix_1.3-4 ggseqlogo_0.1 carData_3.0-4 zoo_1.8-9 reprex_0.3.0 ggridges_0.5.3 png_0.1-7
## [85] viridisLite_0.4.0 bitops_1.0-7 R.oo_1.24.0 KernSmooth_2.23-17 Biostrings_2.56.0 blob_1.2.1 parallelly_1.26.1 rstatix_0.6.0 ggsignif_0.6.0 CNEr_1.24.0 scales_1.1.1 memoise_1.1.0 magrittr_2.0.1 ica_1.0-2 zlibbioc_1.34.0 compiler_4.0.3 RColorBrewer_1.1-2 fitdistrplus_1.1-5 Rsamtools_2.4.0 cli_3.0.0 XVector_0.28.0 listenv_0.8.0 patchwork_1.1.1 pbapply_1.4-3 MASS_7.3-53 mgcv_1.8-33 tidyselect_1.1.1 stringi_1.6.2 yaml_2.2.1 askpass_1.1 ggrepel_0.9.1 grid_4.0.3 fastmatch_1.1-0 tools_4.0.3 future.apply_1.7.0 rstudioapi_0.11 TFMPvalue_0.0.8 foreign_0.8-80 lsa_0.73.2 gridExtra_2.3 farver_2.1.0 Rtsne_0.15
## [127] digest_0.6.27 BiocManager_1.30.10 shiny_1.6.0 pracma_2.2.9 qlcMatrix_0.9.7 Rcpp_1.0.6 car_3.0-10 broom_0.7.2 later_1.2.0 RcppAnnoy_0.0.18 httr_1.4.2 AnnotationDbi_1.50.3 colorspace_2.0-2 rvest_0.3.6 XML_3.99-0.3 fs_1.5.0 tensor_1.5 reticulate_1.20 splines_4.0.3 uwot_0.1.10 RcppRoll_0.3.0 spatstat.utils_2.2-0 plotly_4.9.4.1 xtable_1.8-4 jsonlite_1.7.2 poweRlaw_0.70.6 R6_2.5.0 pillar_1.6.1 htmltools_0.5.1.1 mime_0.11 glue_1.4.2 fastmap_1.1.0 BiocParallel_1.22.0 codetools_0.2-17 utf8_1.2.1 lattice_0.20-41 spatstat.sparse_2.0-0 curl_4.3.2 leiden_0.3.8 gtools_3.9.2 zip_2.1.1 GO.db_3.11.4
## [169] openxlsx_4.2.4 openssl_1.4.4 survival_3.2-7 rmarkdown_2.5 docopt_0.7.1 munsell_0.5.0 GenomeInfoDbData_1.2.3 haven_2.3.1 reshape2_1.4.4 gtable_0.3.0 spatstat.core_2.2-0